Using dplyr, we will do some data reshaping and render visualizations of the results with the USAFacts datasets.

USAFacts provides data on cases, deaths, and population sizes for download at the county level at the following URL: https://usafacts.org/visualizations/coronavirus-covid-19-spread-map/

Code Organization

0. Dependencies
1a. Load data
1b. Cleaning data
2a. Reshape data
2b. De-cumulate data
3. Merge data
4. Summarize into weekly case and death counts and per capita incidence
5. Examine trends within a state
    - Plot county trends
    - Plot summary county trends

Our data analysis goals include the following:

Our pedagogical goals include the following:

Some caveats that must be given:

References:

  1. https://usafacts.org/visualizations/coronavirus-covid-19-spread-map/
  2. https://usafacts.org/articles/detailed-methodology-covid-19-data/
  3. https://en.m.wikipedia.org/wiki/Dplyr
  4. https://dplyr.tidyverse.org/
  5. https://r4ds.had.co.nz/transform.html
  6. https://deanattali.com/2015/03/24/knitrs-best-hidden-gem-spin/
# 0. Dependencies ---------------------------------------------------------
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.1.0     ✓ dplyr   1.0.4
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# 1a. Load data ------------------------------------------------------------

cases <- readr::read_csv("covid_confirmed_usafacts.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   `County Name` = col_character(),
##   State = col_character(),
##   StateFIPS = col_character()
## )
## See spec(...) for full column specifications.
deaths <- readr::read_csv("covid_deaths_usafacts.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   `County Name` = col_character(),
##   State = col_character(),
##   StateFIPS = col_character()
## )
## See spec(...) for full column specifications.
popsize <- readr::read_csv("covid_county_population_usafacts.csv")
## Parsed with column specification:
## cols(
##   countyFIPS = col_double(),
##   `County Name` = col_character(),
##   State = col_character(),
##   population = col_double()
## )
# 1b. Clean data ----------------------------------------------------------

# an aside on how %>% and %<>% work:

# f(x) and x %>% f() are equivalent expressions (meaning they will do the same
# things).
# similarly, f(x,y) and x %>% f(y) are equivalent expressions. 

# f <- f(x) and x %<>% f() are equivalent expressions.
# similarly, x <- f(x,y) and x %<>% f(y) are equivalent expressions.

# example 
# head(mtcars)
# mtcars %>% head()

# head(mtcars, 3)
# mtcars %>% head(3) 

# example of copying mtcars and then using head to replace it with the 
# first 3 rows 
# example_df <- mtcars
# example_df %<>% head(3)

# if you need to refer to the "left hand variable", you can do so 
# by referring to it with a period (as in .)
# example:
# iris %>% subset(1:nrow(.) %% 2 == 0)

# you can access the help pages via ?`%>%` or ?`%<>%`


# clean column names 
cases %<>% janitor::clean_names()
deaths %<>% janitor::clean_names()
popsize %<>% janitor::clean_names()


# 2a. Reshape data ---------------------------------------------------------

# pivot into a long format so that each row contains a unique observation
cases %<>% pivot_longer(
  cols = colnames(.)[5:ncol(.)],
  names_to = 'date',
  values_to = 'cases')

deaths %<>% pivot_longer(
  cols = colnames(.)[5:ncol(.)],
  names_to = 'date',
  values_to = 'deaths')
  
# now that the data are in long format, let's clean the date field
cases$date %<>% 
  stringr::str_remove_all("x") %>% 
  lubridate::ymd()

deaths$date %<>% 
  stringr::str_remove_all("x") %>% 
  lubridate::ymd()

# 2b. De-cumulate data ----------------------------------------------------

# create a second dataset that has dates for one week ahead 
# and rename variables appropriately
cases_1week_behind <- cases %>% 
  mutate(one_week_ahead_date = date + 7) %>% 
  rename(cases_one_week_lagged = cases) %>% 
  select(-date) 

# merge in cases from 1 week ahead
cases %<>% left_join(
  cases_1week_behind,
  by = c(
    'date' = 'one_week_ahead_date', 
    'county_fips', 
    'county_name', 
    'state', 
    'state_fips'))

# calculate last 7 days of cases from cumulative totals
cases %<>% mutate(
  last_7days_of_cases = cases - cases_one_week_lagged)

# filter because I only want one day per week of observations
cases %<>% mutate(
  weekday = lubridate::wday(date)) %>% 
  filter(weekday == 1) # e.g. sundays


# repeat steps for deaths, starting with creating the lagged dataset:
deaths_1week_behind <- deaths %>% 
  mutate(one_week_ahead_date = date + 7) %>% 
  rename(deaths_one_week_lagged = deaths) %>% 
  select(-date) 

# merge in lagged dataset
deaths %<>% left_join(
  deaths_1week_behind,
  by = c(
    'date' = 'one_week_ahead_date', 
    'county_fips', 
    'county_name', 
    'state', 
    'state_fips'))

# calculate last 7 days of deaths from cumulative totals
deaths %<>% mutate(
  last_7days_of_deaths = deaths - deaths_one_week_lagged)

# filter because I only want one day per week of observations
deaths %<>% mutate(
  weekday = lubridate::wday(date)) %>% 
  filter(weekday == 1) # e.g. sundays

# for both datasets, drop the columns of data we won't use (the cumulative
# totals) now that we've made our 7 day counts
cases %<>% select(-c(cases, cases_one_week_lagged))
deaths %<>% select(-c(deaths, deaths_one_week_lagged))

# 3. Merge data -----------------------------------------------------------

# we'll drop state_fips here since we won't be using it and it isn't 
# present in the population size dataset -- 
# we also won't be using weekday anymore, so we'll drop that too

cases %<>% select(-c(state_fips, weekday))
deaths %<>% select(-c(state_fips, weekday))

# merge population sizes into the cases data
df <- 
  left_join(
    cases,
    popsize,
    by = c('county_fips', 'county_name', 'state')) 

df %<>% 
  left_join(
    deaths,
    by = c('date', 'county_fips', 'county_name', 'state')) 

# 4. Summarize into weekly cumulative incidence proportions ---------------

df %<>% mutate(
  weekly_cumulative_cases_per_100k = last_7days_of_cases / population * 1e5,
  weekly_cumulative_deaths_per_100k = last_7days_of_deaths / population * 1e5,
)

# 5. Examine trends within a state ----------------------------------------

# weekly cases plot
df %>% 
  filter(state == 'MA') %>% 
  filter(county_name != 'Statewide Unallocated') %>% 
  ggplot(
    aes(
      x = date, 
      y = weekly_cumulative_cases_per_100k, 
      color = county_name)) + 
  geom_line() + 
  ylab("Weekly Cases per 100,000 Residents") + 
  xlab("Date") + 
  labs(color = 'County Name') + 
  ggtitle("Weekly Cases per 100,000 Residents, Massachusetts")
## Warning: Removed 104 row(s) containing missing values (geom_path).

# weekly deaths plot
df %>% 
  filter(state == 'MA') %>% 
  filter(county_name != 'Statewide Unallocated') %>% 
  ggplot(
    aes(
      x = date, 
      y = weekly_cumulative_deaths_per_100k, 
      color = county_name)) + 
  geom_line() + 
  ylab("Weekly Deaths per 100,000 Residents") + 
  xlab("Date") + 
  labs(color = 'County Name') + 
  ggtitle("Weekly Deaths per 100,000 Residents, Massachusetts")
## Warning: Removed 104 row(s) containing missing values (geom_path).

# a function for plotting weekly cases per 100k by state
plot_weekly_cases_per_100k_by_state <- function(state_abbrev) { 
  
  df %>% 
    filter(state == state_abbrev) %>% 
    filter(county_name != 'Statewide Unallocated') %>% 
    ggplot(
      aes(
        x = date, 
        y = weekly_cumulative_cases_per_100k, 
        color = county_name)) + 
    geom_line() + 
    ylab("Weekly Cases per 100,000 Residents") + 
    xlab("Date") + 
    labs(color = 'County Name') + 
    ggtitle(str_c("Weekly Cases per 100,000 Residents, ", state_abbrev)) + 
    theme(legend.position = 'bottom')
}
plot_weekly_cases_per_100k_by_state("NY")
## Warning: Removed 62 row(s) containing missing values (geom_path).

# a new version of the prior function, now with averages and confidence
# intervals by week
plot_weekly_cases_per_100k_smoothed <- function(state_abbrev) {
  df %>% 
    filter(state == state_abbrev) %>% 
    filter(county_name != 'Statewide Unallocated') %>% 
    group_by(date) %>% 
    summarize(
      mean_cases_per_100k = mean(weekly_cumulative_cases_per_100k, na.rm=T),
      ci_high_cases_per_100k = quantile(weekly_cumulative_cases_per_100k, 0.975, na.rm=T),
      ci_low_cases_per_100k = quantile(weekly_cumulative_cases_per_100k, 0.025, na.rm=T)
    ) %>% 
    ggplot(
      aes(
        x = date, 
        y = mean_cases_per_100k, 
        ymax = ci_high_cases_per_100k,
        ymin = ci_low_cases_per_100k)) + 
    geom_ribbon(size = 0, fill = 'cadetblue', alpha = 0.7) + 
    geom_line() + 
    ylab("Weekly Cases per 100,000 Residents") + 
    xlab("Date") + 
    ggtitle(str_c("Weekly Cases per 100,000 Residents, ", state_abbrev),
            subtitle = "Average and 95% Quantile Range Shown Among Weekly County Cases per 100,000")
}
plot_weekly_cases_per_100k_smoothed('MA')
## Warning: Removed 1 row(s) containing missing values (geom_path).

plot_weekly_cases_per_100k_smoothed('NY')
## Warning: Removed 1 row(s) containing missing values (geom_path).

plot_weekly_cases_per_100k_smoothed('TX')
## Warning: Removed 1 row(s) containing missing values (geom_path).

plot_weekly_cases_per_100k_smoothed('FL')
## Warning: Removed 1 row(s) containing missing values (geom_path).

# use plotly to create an interactive version
plot_weekly_cases_per_100k_smoothed('FL') %>% 
  ggplotly()